######################################################
##  Replication Paper                               ##
##  Gov 2001, Spring 2018                           ##
######################################################

# Load Packages
library(dplyr)
library(ggplot2)
library(stargazer)
library(haven)
library(foreign)
library(pglm)
library(sandwich)
library(MASS)
setwd("...") # change this to your working drive


##############################
# Determining Authorship Order
##############################

set.seed(02138)
authors <- c("Author 1", "Author 2", "Author 3")
sample(authors, 3, replace = F)

#############
# Import and Merge Data
#############

# Clear workspace and import data 
rm(list = ls()) # clear workspace
data <- read.csv("rep_paper_data.csv")

############
# Extensions 
############

# Check dispersion of DVs
DVs <- c("riots", "purges", "revolutions", "demonstrations", "assassinations", "strikes", "govcrises", "guerilla")
for (i in 1:length(DVs)){
  cat("Dispersion of", DVs[i], "is", var(data[,DVs[i]], na.rm = T)/mean(data[,DVs[i]], na.rm = T), " \n ")
}
# Takeaway our DVs are overdispersed. 

#####
# Create Lagged DVs to deal with autocorrelation
#####

data <- tbl_df(data) %>% group_by(cowcode) %>% 
  mutate(rev_lag1 = lag(revolutions),
         demo_lag1 = lag(demonstrations),
         riots_lag1 = lag(riots),
         purges_lag1 = lag(purges),
         ass_lag1 = lag(assassinations),
         str_lag1 = lag(strikes),
         govc_lag1 = lag(govcrises),
         guer_lag1 = lag(guerilla))


#######################
# Poisson and NB Models
#######################

# Subset data by DV for easy plotting 
riot_data <- tbl_df(data)%>% subset(select = c(riots, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, riots_lag1)) %>% mutate(Lagged_DV = riots_lag1) 
purge_data <- tbl_df(data)%>% subset(select = c(purges, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, purges_lag1)) %>% mutate(Lagged_DV = purges_lag1)  
rev_data <- tbl_df(data)%>% subset(select = c(revolutions, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, rev_lag1)) %>% mutate(Lagged_DV = rev_lag1)  
demo_data <- tbl_df(data)%>% subset(select = c(demonstrations, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, demo_lag1)) %>% mutate(Lagged_DV = demo_lag1)
ass_data <- tbl_df(data)%>% subset(select = c(assassinations, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, ass_lag1)) %>% mutate(Lagged_DV = ass_lag1)
str_data <- tbl_df(data)%>% subset(select = c(strikes, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, str_lag1)) %>% mutate(Lagged_DV = str_lag1)
govc_data <- tbl_df(data)%>%  subset(select = c(govcrises, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, govc_lag1)) %>% mutate(Lagged_DV = govc_lag1)
guer_data <- tbl_df(data)%>%  subset(select = c(guerilla, mdi, lgdpl, larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl, year, cowcode, guer_lag1)) %>% mutate(Lagged_DV = guer_lag1)

# Poisson Model + Controls w/out Log GDP/PC
mod_riots_pois <- glm(riots ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = riot_data)
mod_purge_pois  <- glm(purges ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = purge_data)
mod_rev_pois  <- glm(revolutions ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = rev_data)
mod_demo_pois <- glm(demonstrations ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = demo_data)
mod_ass_pois  <- glm(assassinations ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = ass_data)
mod_str_pois  <- glm(strikes ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = str_data)
mod_govc_pois  <- glm(govcrises ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = govc_data)
mod_guer_pois  <- glm(guerilla ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = guer_data)

# Poisson Model + Controls + Log GDP/PC
mod_riots_cov_pois <- glm(riots ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = riot_data)
mod_purge_cov_pois  <- glm(purges ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = purge_data)
mod_rev_cov_pois  <- glm(revolutions ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = rev_data)
mod_demo_cov_pois <- glm(demonstrations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = demo_data)
mod_ass_cov_pois  <- glm(assassinations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = ass_data)
mod_str_cov_pois  <- glm(strikes ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = str_data)
mod_govc_cov_pois  <- glm(govcrises ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = govc_data)
mod_guer_cov_pois  <- glm(guerilla ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = guer_data)

# NB Model + Controls + Log GDP/PC
mod_riots_cov_nb <- glm.nb(riots ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = riot_data)
mod_purge_cov_nb <- glm.nb(purges ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = purge_data)
mod_rev_cov_nb <- glm.nb(revolutions ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = rev_data)
mod_demo_cov_nb <- glm.nb(demonstrations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = demo_data)
mod_ass_cov_nb <- glm.nb(assassinations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = ass_data)
mod_str_cov_nb <- glm.nb(strikes ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = str_data)
mod_govc_cov_nb <- glm.nb(govcrises ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = govc_data)
mod_guer_cov_nb <- glm.nb(guerilla ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = guer_data)

########################################
# Compute Cluster Robust Standard Errors
########################################

# Define function 
cluster_se <- function(dataset, mod){
  library(dplyr); library(sandwich); library(lmtest) # import relevant packages
  vars <- variable.names(mod) # getvariable from model
  vars <- subset(vars, (!vars == "(Intercept)")) # drop intercept
  vars <- c("cowcode", vars) # add cluster variable
  se_data <- tbl_df(dataset) %>% subset(select = vars) # subset data
  row_ids <- rownames(model.matrix(mod)) # rows in model matrix
  se_data <- subset(se_data, rownames(se_data) %in% row_ids) # drop rows not in model matrix
  countries <- as.matrix(se_data$cowcode) # extract cluster ID
  m <- length(unique(countries)) # number of clusters
  k <- length(coef(mod)) # number of IVs
  u <- estfun(mod) 
  u.clust <- matrix(NA, nrow=m, ncol=k)
  for(j in 1:k){
    u.clust[,j] <- tapply(u[,j], countries, sum)
  }
  bread <-vcov(mod) # extract variance-covariance matrix
  cl.vcov <- bread %*% ((m / (m-1)) * t(u.clust) # Make cluster robust matrix
                        %*% (u.clust)) %*% + bread
  coeftest(mod, cl.vcov)
}

# Update Standard Errors -- for Table 2
mod_riots_pois <- cluster_se(riot_data, mod_riots_pois)
mod_riots_cov_pois  <- cluster_se(riot_data, mod_riots_cov_pois) 
mod_riots_cov_nb  <- cluster_se(riot_data, mod_riots_cov_nb) 
mod_demo_pois <- cluster_se(riot_data, mod_demo_pois) 
mod_demo_cov_pois <- cluster_se(riot_data, mod_demo_cov_pois) 
mod_demo_cov_nb <- cluster_se(riot_data, mod_demo_cov_nb)

# Update Standard Errors -- for Appendix
mod_purge_pois <- cluster_se(riot_data, mod_purge_pois)
mod_purge_cov_pois  <- cluster_se(riot_data, mod_purge_cov_pois) 
mod_purge_cov_nb  <- cluster_se(riot_data, mod_purge_cov_nb) 
mod_rev_pois <- cluster_se(riot_data, mod_rev_pois) 
mod_rev_cov_pois <- cluster_se(riot_data, mod_rev_cov_pois) 
mod_rev_cov_nb <- cluster_se(riot_data, mod_rev_cov_nb)
mod_ass_pois <- cluster_se(riot_data, mod_ass_pois) 
mod_ass_cov_pois <- cluster_se(riot_data, mod_ass_cov_pois) 
mod_ass_cov_nb <- cluster_se(riot_data, mod_ass_cov_nb)
mod_str_pois <- cluster_se(riot_data, mod_str_pois) 
mod_str_cov_pois <- cluster_se(riot_data, mod_str_cov_pois) 
mod_str_cov_nb <- cluster_se(riot_data, mod_str_cov_nb)
mod_govc_pois <- cluster_se(riot_data, mod_govc_pois) 
mod_govc_cov_pois <- cluster_se(riot_data, mod_govc_cov_pois) 
mod_govc_cov_nb <- cluster_se(riot_data, mod_govc_cov_nb)
mod_guer_pois <- cluster_se(riot_data, mod_guer_pois) 
mod_guer_cov_pois <- cluster_se(riot_data, mod_guer_cov_pois) 
mod_guer_cov_nb <- cluster_se(riot_data, mod_guer_cov_nb)

###############
# Produce Table
###############

# Table 2
stargazer(mod_riots_pois, mod_riots_cov_pois, mod_riots_cov_nb, mod_demo_pois, mod_demo_cov_pois, mod_demo_cov_nb, 
          title="Poisson and Negative Binomial Regression of Riots and Peaceful Protests", 
          omit.stat = c("adj.rsq", "f"), digits = 4, omit = c("year", "cowcode"),
          notes = c("Note: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
          notes.append = FALSE, notes.align = "l")

# Tables Not Presented in Text concerning Revolutions, Strikes, Guerilla Attacks, Gov. Crises, Purges, and Assassinations
stargazer(mod_rev_cov_pois, mod_rev_cov_nb, mod_str_cov_pois, mod_str_cov_nb,  mod_guer_cov_pois, mod_guer_cov_nb, 
          title="Poisson and Negative Binomial Regression of Revoltions, Strikes, and Armed Attacks", 
          omit.stat = c("adj.rsq", "f"), digits = 4, omit = c("year", "cowcode"),
          notes = c("Note: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
          notes.append = FALSE, notes.align = "l")

stargazer(mod_govc_cov_pois, mod_govc_cov_nb, mod_purge_cov_pois, mod_purge_cov_nb,  mod_ass_cov_pois, mod_ass_cov_nb, 
          title="Poisson and Negative Binomial Regression of Gov. Crises, Gov. Purges, and Assassinations", 
          omit.stat = c("adj.rsq", "f"), digits = 4, omit = c("year", "cowcode"),
          notes = c("Note: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
          notes.append = FALSE, notes.align = "l")


#################################################
# Marginal Effects Plot, Riots vs. Demonstrations
#################################################

######################################
# Using Poisson Regression - Model (2) 
######################################

# Model with Riots
mod_riots_cov_pois <- glm(riots ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = riot_data)
vars <- variable.names(mod_riots_cov_pois) # gets list of variable names and intercept
data_MEplot_riots <- riot_data[c('riots', vars[-1])]# adds DV and drops intercept form list of variables
data_MEplot_riots <- na.omit(data_MEplot_riots) # omits missing values
mean.covariates_riots <- apply(data_MEplot_riots[,-1], MARGIN = 2, mean)
mean.covariates_riots  <- c(1, mean.covariates_riots)
names(mean.covariates_riots)[1] <- c('(Intercept)')

set.seed(02138)
# Simulate from multivariate normal
library(mvtnorm)
simulated.beta_riots <- rmvnorm(5000, mean = coef(mod_riots_cov_pois), sigma = vcov(mod_riots_cov_pois)) 
print(dim(simulated.beta_riots))

plot.mdi_riots <- sapply(seq(min(data_MEplot_riots$mdi), max(data_MEplot_riots$mdi), length.out = 50), FUN=function(i){ 
  xb.i <- mean.covariates_riots
  xb.i[names(xb.i) == 'mdi'] <- i
  pr.i <- plogis(simulated.beta_riots %*% xb.i)
  return(c(i, quantile(pr.i, c(0.025, 0.5, 0.975)), mean(pr.i))) 
})
plot.mdi_riots <- data.frame(t(plot.mdi_riots))
names(plot.mdi_riots) <- c('mdi', 'p025', 'median', 'p975', 'mean')
# dev.off() # must run this to clear ROC curves
ggplot(plot.mdi_riots) + geom_line(aes(x=mdi,y=mean)) + 
  geom_ribbon(aes(x=mdi, ymin=p025,ymax=p975), alpha = 0.25) + 
  ylim(0,0.4)+
  xlab('MDI') + ylab('Probability of Riots') +  
  theme_bw() + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 12))
ggsave("MDI_riots.jpeg", plot = last_plot(), width = 7, height = 5, units = "in") 


# Model with Demonstrations
mod_demo_cov_pois <- glm(demonstrations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, family='poisson', data = demo_data)
vars <- variable.names(mod_demo_cov_pois) # gets list of variable names and intercept
data_MEplot_demon <- demo_data[c('demonstrations', vars[-1])]# adds DV and drops intercept form list of variables
data_MEplot_demon <- na.omit(data_MEplot_demon) # omits missing values
mean.covariates_demon <- apply(data_MEplot_demon[,-1], MARGIN = 2, mean)
mean.covariates_demon  <- c(1, mean.covariates_demon)
names(mean.covariates_demon)[1] <- c('(Intercept)')

set.seed(02138)
# Simulate from multivariate normal
library(mvtnorm)
simulated.beta_demon <- rmvnorm(5000, mean = coef(mod_demo_cov_pois), sigma = vcov(mod_demo_cov_pois)) 
print(dim(simulated.beta_demon))

plot.mdi_demon <- sapply(seq(min(data_MEplot_demon$mdi), max(data_MEplot_demon$mdi), length.out = 50), FUN=function(i){ 
  xb.i <- mean.covariates_demon
  xb.i[names(xb.i) == 'mdi'] <- i
  pr.i <- plogis(simulated.beta_demon %*% xb.i)
  return(c(i, quantile(pr.i, c(0.025, 0.5, 0.975)), mean(pr.i))) 
})
plot.mdi_demon <- data.frame(t(plot.mdi_demon))
names(plot.mdi_demon) <- c('mdi', 'p025', 'median', 'p975', 'mean')
# dev.off() # must run this to clear ROC curves
ggplot(plot.mdi_demon) + geom_line(aes(x=mdi,y=mean)) + 
  geom_ribbon(aes(x=mdi, ymin=p025,ymax=p975), alpha = 0.25) + 
  xlab('MDI') + ylab('Probability of Peaceful Anti-Gov Demonstrations') +  
  coord_cartesian(ylim=c(0,0.4)) +
  theme_bw() + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 12))

ggsave("MDI_demonstrations.jpeg", plot = last_plot(), width = 7, height = 5, units = "in") 

####################################################
#NEGATIVE BINOMIAL REGRESSION - MODEL 3
####################################################
#model with riots
mod_riots_cov_nb <- glm.nb(riots ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = riot_data)

vars <- variable.names(mod_riots_cov_nb) # gets list of variable names and intercept
data_MEplot_riots <- riot_data[c('riots', vars[-1])]# adds DV and drops intercept form list of variables
data_MEplot_riots <- na.omit(data_MEplot_riots) # omits missing values

mean.covariates_riots <- apply(data_MEplot_riots[,-1], MARGIN = 2, mean)
mean.covariates_riots  <- c(1, mean.covariates_riots)
names(mean.covariates_riots)[1] <- c('(Intercept)')

set.seed(02138)
# Simulate from multivariate normal
library(mvtnorm)
simulated.beta_riots <- rmvnorm(5000, mean = coef(mod_riots_cov_nb), sigma = vcov(mod_riots_cov_nb)) 
print(dim(simulated.beta_riots))

plot.mdi_riots <- sapply(seq(min(data_MEplot_riots$mdi), max(data_MEplot_riots$mdi), length.out = 50), FUN=function(i){ 
  xb.i <- mean.covariates_riots
  xb.i[names(xb.i) == 'mdi'] <- i
  pr.i <- plogis(simulated.beta_riots %*% xb.i)
  return(c(i, quantile(pr.i, c(0.025, 0.5, 0.975)), mean(pr.i))) 
})
plot.mdi_riots <- data.frame(t(plot.mdi_riots))
names(plot.mdi_riots) <- c('mdi', 'p025', 'median', 'p975', 'mean')
# dev.off() # must run this to clear ROC curves
ggplot(plot.mdi_riots) + geom_line(aes(x=mdi,y=mean)) + 
  geom_ribbon(aes(x=mdi, ymin=p025,ymax=p975), alpha = 0.25) + 
  ylim(0,0.4)+
  xlab('MDI') + ylab('Probability of Riots') +  
  theme_bw() + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 12))

ggsave("MDI_riots_Negative_Bin.jpeg", plot = last_plot(), width = 7, height = 5, units = "in") 

#MODEL 

#model with demonstrations
mod_demo_cov_nb <- glm.nb(demonstrations ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + Lagged_DV, data = demo_data)
vars <- variable.names(mod_demo_cov_nb) # gets list of variable names and intercept
data_MEplot_demon <- demo_data[c('demonstrations', vars[-1])]# adds DV and drops intercept form list of variables
data_MEplot_demon <- na.omit(data_MEplot_demon) # omits missing values

mean.covariates_demon <- apply(data_MEplot_demon[,-1], MARGIN = 2, mean)
mean.covariates_demon  <- c(1, mean.covariates_demon)
names(mean.covariates_demon)[1] <- c('(Intercept)')

set.seed(02138)
# Simulate from multivariate normal
library(mvtnorm)
simulated.beta_demon <- rmvnorm(5000, mean = coef(mod_demo_cov_nb), sigma = vcov(mod_demo_cov_nb)) 
print(dim(simulated.beta_demon))

plot.mdi_demon <- sapply(seq(min(data_MEplot_demon$mdi), max(data_MEplot_demon$mdi), length.out = 50), FUN=function(i){ 
  xb.i <- mean.covariates_demon
  xb.i[names(xb.i) == 'mdi'] <- i
  pr.i <- plogis(simulated.beta_demon %*% xb.i)
  return(c(i, quantile(pr.i, c(0.025, 0.5, 0.975)), mean(pr.i))) 
})
plot.mdi_demon <- data.frame(t(plot.mdi_demon))
names(plot.mdi_demon) <- c('mdi', 'p025', 'median', 'p975', 'mean')
# dev.off() # must run this to clear ROC curves
ggplot(plot.mdi_demon) + geom_line(aes(x=mdi,y=mean)) + 
  geom_ribbon(aes(x=mdi, ymin=p025,ymax=p975), alpha = 0.25) + 
  xlab('MDI') + ylab('Probability of Peaceful Anti-Gov Demonstrations') +  
  coord_cartesian(ylim=c(0,0.4)) +
  theme_bw() + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 12))

ggsave("MDI_demonstrations_Negative_Bin.jpeg", plot = last_plot(), width = 7, height = 5, units = "in") 


#######################################
# CODE FROM REPLICATION OF WARREN(2014)
#######################################

#######################################
# Replication of Figure 1. Density Plot
#######################################

# Import Warren's cleaned data
rm(list = ls()) # clear workspace
data <- read_dta("Warren_IO_reg_data.dta") 

data_1989 <- subset(data, (year == 1989)) 
# Warren's Density Plot
key.country <- c('Rwanda', 'Yugoslavia')
ggdensity(data_1989, x = 'mdi',
          xlab = 'Media Density Index, 1989 (by country)',
          ylab = 'Kernel density',
          fill = "white", color = "black",
          label = "country", label.select = key.country, repel = TRUE,
          legend.title = ""       # Hide legend title
)


###############################
## Table 1. Logistic Regression  
###############################

# Get coefficients using GLM function 
model1 <- glm(onset ~ mdi+ lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family=binomial(logit))
model2 <- glm(onset ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))
model3 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))
model4 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + telephli + pcyrs + spline1 + spline2 + spline3 , data = data, family = binomial(logit))
model5 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + litli + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))
model6 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + seduli + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))
model7 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl +  pfl + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))

## Compute cluster robust standard errors by hand using Molly Roberts' code
# http://projects.iq.harvard.edu/files/gov2001/files/sesection_5.pdf
library(dplyr); library(sandwich)
cluster_se <- function(dataset, mod){
  library(dplyr); library(sandwich); library(lmtest) # import relevant packages
  vars <- variable.names(mod) # getvariable from model
  vars <- subset(vars, (!vars == "(Intercept)")) # drop intercept
  vars <- c("cowcode", vars) # add cluster variable
  se_data <- tbl_df(dataset) %>% subset(select = vars) # subset data
  se_data <- na.omit(se_data) # drop rows with missing values
  countries <- as.matrix(se_data$cowcode) # extract cluster ID
  m <- length(unique(countries)) # number of clusters
  k <- length(coef(mod)) # number of IVs
  u <- estfun(mod) 
  u.clust <- matrix(NA, nrow=m, ncol=k)
  for(j in 1:k){
    u.clust[,j] <- tapply(u[,j], countries, sum)
  }
  bread <-vcov(mod) # extract variance-covariance matrix
  cl.vcov <- bread %*% ((m / (m-1)) * t(u.clust) # Make cluster robust matrix
                        %*% (u.clust)) %*% + bread
  mod <- coeftest(mod, cl.vcov) # get test coefficients
  return(mod)
}
# Apply function to models
model1 <- cluster_se(data, model1)
model2 <- cluster_se(data, model2)
model3 <- cluster_se(data, model3)
model4 <- cluster_se(data, model4)
model5 <- cluster_se(data, model5)
model6 <- cluster_se(data, model6)
model7 <- cluster_se(data, model7)

# Produce Table 1 
table1 <- stargazer(model1, model2, model3, model4, model5, model6, model7,
                    title="Warren 2014 Table 1: Logistic Regression -- civil war onset", 
                    omit.stat = c("adj.rsq", "f"), omit = c("spline1", "spline2", "spline3"), digits = 4,
                    add.lines = list(c("Splines (1-3)", "N/S", "N/S", "N/S", "N/S", "N/S", "N/S", "N/s")),
                    notes = c("Note: All independent variables lagged by one year. Robust standard errors in parentheses.", 
                              "N/S indicates splines were included, but were not significant.", 
                              "$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
                    notes.append = FALSE, notes.align = "l")


###############################################
## Replication of Figure 2. Substantive effects
###############################################

# Figure 2 is built from Model 3 in the paper. 
if (!require(Zelig)) {install.packages("Zelig"); require(Zelig)}
# Documentation for zelig, the R equivalent of clarify: https://cran.r-project.org/web/packages/Zelig/Zelig.pdf. 
# See page 77 logistic regression
form3 <- onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3

z.out <- zelig(form3, model = "logit", data = data, cite = FALSE)
summary(z.out)
summary(z.out, odds_ratios = TRUE)

#mdi
mdi.low <- setx(z.out, mdi = quantile(data$mdi, prob = 0.05, na.rm = TRUE))
mdi.high <- setx(z.out, mdi = quantile(data$mdi, prob = 0.95, na.rm = TRUE))

s.out.mdi <- sim(z.out, x = mdi.low, x1 = mdi.high)
summary(s.out.mdi) # Almost the exact same value as in the paper -- 3.2% probability at low, 0.08 percent probability at high.
mdi.effect <- 0.03127347 # As he noted in the paper, the actual effect is negative. 
plot(s.out.mdi)

#population
pop.low <- setx(z.out, lpopl = quantile(data$lpopl, prob = 0.05, na.rm = TRUE))
pop.high <- setx(z.out, lpopl = quantile(data$lpopl, prob = 0.95, na.rm = TRUE))

s.out.pop <- sim(z.out, x = pop.low, x1 = pop.high)
summary(s.out.pop) # same
pop.effect <- 0.0166512
plot(s.out.pop)

#oil exporter
oil.low <- setx(z.out, oil2l = quantile(data$oil2l, prob = 0.05, na.rm = TRUE))
oil.high <- setx(z.out, oil2l = quantile(data$oil2l, prob = 0.95, na.rm = TRUE))

s.out.oil <- sim(z.out, x = oil.low, x1 = oil.high)
summary(s.out.oil) # same
oil.effect <- 0.01133236
plot(s.out.oil)

#democracy
dem.low <- setx(z.out, deml = 1)
dem.high <- setx(z.out, deml = 12)

s.out.dem <- sim(z.out, x = dem.low, x1 = dem.high)
summary(s.out.dem) # same
dem.effect <- 0.01060468
plot(s.out.dem)

#religious fractionalization
rel.low <- setx(z.out, relfracl = quantile(data$relfracl, prob = 0.05, na.rm = TRUE))
rel.high <- setx(z.out, relfracl = quantile(data$relfracl, prob = 0.95, na.rm = TRUE))

s.out.rel <- sim(z.out, x = rel.low, x1 = rel.high)
summary(s.out.rel) # same
rel.effect <- 0.01003504
plot(s.out.rel)

#mountainous terrain
mtn.low <- setx(z.out, lmtn = quantile(data$lmtn, prob = 0.05, na.rm = TRUE))
mtn.high <- setx(z.out, lmtn = quantile(data$lmtn, prob = 0.95, na.rm = TRUE))

s.out.mtn <- sim(z.out, x = mtn.low, x1 = mtn.high)
summary(s.out.mtn) # same
mtn.effect <- 0.006942174
plot(s.out.mtn)

effects <- c(mdi.effect, pop.effect, oil.effect, dem.effect, rel.effect, mtn.effect)
variables <- c("Media Density Index", "Population", "Oil Exporter", "Democracy", "Religious fract.", "Mountainous terrain")
effects.df <- data.frame(variables, effects)
levels(effects.df$variables)
effects.df$variables <- factor(effects.df$variables, 
                               levels = c("Media Density Index", "Population", "Oil Exporter", "Democracy", "Religious fract.", "Mountainous terrain"))
effects.df$color <- as.factor(ifelse(effects.df$variables == "Media Density Index", 1, 0))

effectsplot <- ggplot(effects.df, aes(x = variables, y = effects, fill = color)) + geom_col(width = .75) + 
  labs(x = "") + 
  scale_y_continuous(name = expression(paste(Delta, "Pr(Onset = 1)")),  
                     breaks = seq(0, 0.035, by = .005),
                     limits=c(0, .035)) +
  scale_fill_manual(values=c('gray80','gray35')) + guides(fill = FALSE) + 
  theme_bw() + theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 11)) + 
  theme(panel.grid.minor = element_blank()) 

effectsplot
ggsave("Figure2.jpeg", plot = last_plot(), width = 7, height = 5, units = "in") 



#####################################################
## Replication of Figure 3. ROC Curves for Models 1-3
#####################################################

#Model1
library(ROCR)
model1 <- glm(onset ~ lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family=binomial(logit))

data_roc1 <- data
data_roc1 <- tbl_df(data)%>%  
  subset(select = c(onset, lgdpl,larea, lmtn, lpopl, oil2l, deml, deml2, ethfracl, relfracl,pcyrs, spline1, spline2,spline3))
data_roc1 <- na.omit(data_roc1) # drop rows with missing values

predict1 <- predict(model1, type = "response")
pred1 <- prediction(predict1, data_roc1$onset)    
perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")     

#Check that AUC values match
auc_ROCR1 <- performance(pred1, measure = "auc")
auc_ROCR1 <- auc_ROCR1@y.values[[1]]
auc_ROCR1

#Model2
model2 <- glm(onset ~ mdi + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))

data_roc2 <- data
data_roc2 <- tbl_df(data)%>%  
  subset(select = c(onset,mdi,larea,lmtn,lpopl,oil2l,deml,deml2,ethfracl,relfracl,pcyrs,spline1, spline2,spline3))
data_roc2 <- na.omit(data_roc2) # drop rows with missing values

predict2 <- predict(model2, type = "response")
pred2 <- prediction(predict2, data_roc2$onset)    
perf2 <- performance(pred2, measure = "tpr", x.measure = "fpr")  

#Check that AUC values match
auc_ROCR2 <- performance(pred2, measure = "auc")
auc_ROCR2 <- auc_ROCR2@y.values[[1]]
auc_ROCR2

#Model3
model3 <- glm(onset ~ mdi + lgdpl + larea + lmtn + lpopl + oil2l + deml + deml2 + ethfracl + relfracl + pcyrs + spline1 + spline2 + spline3, data = data, family = binomial(logit))

data_roc3 <- data
data_roc3 <- tbl_df(data)%>%  
  subset(select = c(onset,mdi,lgdpl,larea,lmtn,lpopl,oil2l,deml,deml2,ethfracl,relfracl, spline1, spline2,spline3))

data_roc3 <- na.omit(data_roc3) # drop rows with missing values

predict3 <- predict(model3, type = "response")
pred3 <- prediction(predict3, data_roc3$onset)    
perf3 <- performance(pred3, measure = "tpr", x.measure = "fpr")     

#Check that AUC values match
auc_ROCR3 <- performance(pred3, measure = "auc")
auc_ROCR3 <- auc_ROCR3@y.values[[1]]
auc_ROCR3

#Figure 3 from the paper - plot all three ROC curves
dev.off()
plot(perf1, col = "grey87" )
plot(perf2, add = TRUE, col = "gray52")
plot(perf3, add = TRUE, col="gray0")
abline(0, 1) 
legend("bottomright", lwd = 2, legend=c("Controls + GDP; AUC = 0.7366", "Controls + Media; AUC = 0.7576", "Controls + GDP + Media; AUC = 0.7597"), col=c("grey87", "grey52","grey0"),cex=0.70,pt.cex=0.70)
dev.copy(jpeg,filename="ROCplot.jpg")
dev.off ()


